////////////////////////////////////////////////////////////////////////////////

*****************************Clean UK data**************************************

////////////////////////////////////////////////////////////////////////////////

cd "$ukdata"

////////////////////////////////////////////////////////////////////////////////
*BHPS data (1991-2008)*
////////////////////////////////////////////////////////////////////////////////

*Time-invariant respondent characteristics from the BHPS cross-wave file
use "raw/xwavedat_bhps.dta", clear
	*Male indicator: 1 for sex==1, 0 for sex==2, missing for any other code
	gen male=(sex==1) if inlist(sex,1,2)

	*Immigrant indicator: 1 when yr2uk4 is positive and non-missing, else 0
	gen immigrant=(yr2uk4>0&yr2uk4<.)

	*Non-white indicator from race and racel; the 0 assignment runs last and
	*therefore wins when the two variables disagree
	gen nonwhite=.
	replace nonwhite=1 if (race>1&race!=.)|(racel>5&racel<.)
	replace nonwhite=0 if race==1|inrange(racel,1,5)

	keep pid pidp male immig nonwhite doby
save "temp/xwave_bhps.dta", replace

*Prepare first set of BHPS waves to be combined
*Wave a: strip the wave prefix and rename hgest to hgemp
use "raw/aindresp.dta", clear
	rename a* *
	rename hgest hgemp
	gen wave="a"
save "temp/bhps_wave_a.dta", replace

*Waves b-f: strip the wave prefix; doiy becomes year
foreach x in b c d e f {
	use "raw/`x'indresp.dta", clear
		rename `x'* *
		gen wave="`x'"
		rename doiy year
	save "temp/bhps_wave_`x'.dta", replace
}

*Prepare second set of waves
*Waves g-r: the start-year and interview-year variables carry a 4 suffix here
*and are brought back to the names used in the earlier waves
foreach x in g h i j k l m n o p q r {
	use "raw/`x'indresp.dta", clear
		rename `x'* *
		gen wave="`x'"
		rename (jbbgy4 jsbgy4 doiy4) (jbbgy jsbgy year)
		cap rename racel race
		*Issue with individual identifier variable renamed in wave p:
		*the identifier is called id after the prefix strip, so restore pid
		if "`x'"=="p" {
			rename id pid
		}
	save "temp/bhps_wave_`x'.dta", replace
}

*Waves from h on have different jbed variables
*jbed is rebuilt here as 1 when any of trq1-trq3 is positive, 0 otherwise.
*Each comparison is bounded with <. because Stata ranks missing values above
*every number, so a bare trq>0 test would also be true for missing trq values
*and would wrongly set jbed to 1.
local waves_g2_2 h i j k l m n o p q r
foreach x of local waves_g2_2 {
	use "temp/bhps_wave_`x'.dta", clear
		gen jbed=1 if (trq1>0&trq1<.)|(trq2>0&trq2<.)|(trq3>0&trq3<.)
		recode jbed .=0
	save "temp/bhps_wave_`x'.dta", replace
}

*Waves from i on have different jbterm variable names
*jbterm1 is renamed to jbterm so that the variable lines up when appending
local waves_g2_3 i j k l m n o p q r
foreach x of local waves_g2_3 {
	use "temp/bhps_wave_`x'.dta", clear
		rename jbterm1 jbterm
	save "temp/bhps_wave_`x'.dta", replace
}

*Combine waves
*Stack the prepared wave files and delete each temporary file once appended
clear
foreach x in a b c d e f g h i j k l m n o p q r {
	append using "temp/bhps_wave_`x'.dta"
	cap erase "temp/bhps_wave_`x'.dta"
}

	*Anything other than 1 (including missing after the append) becomes 0
	replace jbed=0 if jbed!=1
	*Code 3 is folded into code 2, labelled below as not permanent
	replace jbterm=2 if jbterm==3
	label define jbterml -9 "missing or wild" -8 "inapplicable" ///
		-7 "proxy and/or phone" 1 "Permanent job" 2 "Not permanent job"
	label values jbterm jbterml

	*NOTE(review): jbbgy4 and jsbgy4 are renamed away in waves g-r above, so
	*they must come from the earlier waves for this keep to succeed - confirm
	keep doid doim isced jsboss j2has j2semp jbsat4 jbsemp jbterm cjsbgd ///
		cjsbgm cjsbgy4 jbsect jbopps hgemp jbsat jbed tuin1 tuin2 jbbgy ///
		jbbgly jbbgy4 jsbgy4 race jssat2 jsbgy fisit age pid wave jbhas ///
		jboff xrwght jbisco_cc sex mlstat hlstat qfhas jbmngr jbhrs paynl ///
		jbrgsc qfedhi tujbpl plbornc_cc njbnew orgmb year eprosc

	*Two-digit years 92-97 become four-digit; wave a is set to 1991 throughout
	replace year=year+1900 if inlist(year,92,93,94,95,96,97)
	replace year=1991 if wave=="a"
	label values year .
	
*Self-employed respondents: a positive, non-missing jssat2 overrides jbsat4
	replace jbsat4=jssat2 if jssat2>0&jssat2!=.
*The result is the BHPS job security measure
	rename jbsat4 bhps_security
*Rename UKHLS security question in waves 6&7
	rename eprosc ukhls_security
*Chance-to-move-up dummy: 1 when jbopps==1, else 0
	gen opps=(jbopps==1)
*Dummy for having had training for current job, excluding induction
	gen training=(jbed==1)
*Age squared, scaled down by 100
	gen age_squared=age^2/100
*Cross-sectional weight variable
	rename xrwght xr_bhps
	label var xr_bhps "Cross-sectional weight, BHPS only"
*Union variable
	rename tujbpl union_active
*Industry variable: jbisco_cc is used as a string here
	rename jbisco_cc industry
	label var industry "International SOC: present job"
	gen industry_2d=substr(industry,1,2)
	*Keep the two-character code only when it holds nothing but 0-9 . and -
	gen industry_identifier=industry_2d if indexnot(industry_2d, "0123456789.-")==0
	*First character as a number; anything that is not a digit becomes missing
	gen isco=substr(industry_2d,1,1)
	replace isco="" if indexnot(isco,"0123456789")>0
	destring isco, replace

*Generate dummies for categories of workers
*Each dummy is 1 when the stated condition holds and 0 otherwise; parttime is
*the exception and is left missing for respondents who are not employed
	gen temp=(jbterm==2)
	label var temp "Non-permanent employee"
	gen perm=(jbterm==1)
	label var perm "Permanent employee"
	gen employed=(hgemp==1|(hgemp==0&jbhas==1))
	label var employed "Has a job"
	gen parttime=(jbhrs<35&jbhrs>0) if employed==1
	label var parttime "Part-time employee"
	gen semp=(jbsemp==2)
	label var semp "Self-employed"
	gen semp_boss=(jsboss==1)
	label var semp_boss "Self-employed and has employees"
	gen secondjob=(j2has==1)
	label var secondjob "Has a second job"
	gen secondjob_semp=(j2semp==2)
	label var secondjob_semp "Second job is self-employed"

	rename jbrgsc socialclass
	
*Generate numerical wave variable
*Letters a-r map to 1-18 in order; any other value keeps the default of 0
	gen wave1=0
	local k=0
	foreach w in a b c d e f g h i j k l m n o p q r {
		local ++k
		replace wave1=`k' if wave=="`w'"
	}
	drop wave
	rename wave1 wave

*Generate job tenure variable
*Tenure is interview year minus job start year, taken from jbbgy and then
*overridden by jsbgy when that is available.
	*Two-digit start years are moved into the 1900s
	replace jbbgy=1900+jbbgy if jbbgy<100&jbbgy>0
	replace jsbgy=1900+jsbgy if jsbgy<100&jsbgy>0
	*Both start years are bounded with <. because missing compares as larger
	*than any number: without the bound, a missing jsbgy would satisfy jsbgy>0
	*and overwrite a valid jbbgy-based tenure with missing
	gen jobtenure=year-jbbgy if jbbgy>0&jbbgy<.
	replace jobtenure=year-jsbgy if jsbgy>0&jsbgy<.
	replace jobtenure=. if jobtenure<0
	sort pid wave
	xtset pid wave
	*For each wave, fill a missing tenure of an employed respondent from the
	*nearest earlier wave with a tenure, adding the years elapsed since then.
	*Respondents with hgemp 2-5 get tenure 0 in that wave.
	forval i=2/18 {
		local max = `i'-1
		forval j=1/`max' {
			replace jobtenure=L`j'.jobtenure+(year-L`j'.year) if (wave==`i'& ///
jobtenure==.&employed==1&L`j'.jobtenure!=.)
			replace jobtenure=0 if inrange(hgemp,2,5)&wave==`i'
		}
	}
	*Wave 1 is zeroed only after the loop, so later waves never inherit a
	*zero from wave 1
	replace jobtenure=0 if inrange(hgemp,2,5)&wave==1
	xtset, clear
	
*Unified education variable
	rename qfedhi education_bhps
	*Three levels: codes 1-5, codes 6-9, and any non-missing code above 9
	gen unified_education=1 if inrange(education_bhps,1,5)
	replace unified_education=2 if inrange(education_bhps,6,9)
	replace unified_education=3 if education_bhps>9&education_bhps<.
	cap label define education_levels ///
		1 "University/professional degree" ///
		2 "School qualification (e.g. A/O Level)" ///
		3 "Less than school qualification"
	label values unified_education education_levels

	*Collapse the source isced codes 4, 6 and 7 onto 3, 5 and 6; the untouched
	*source variable is kept as isced_original
	gen isced_97=isced
	recode isced_97 (4=3) (6=5) (7=6)
	rename (isced isced_97) (isced_original isced)
	label define isced 1 "Primary" 2 "Lower secondary" 3 "Upper secondary" ///
		4 "Upper secondary (vocational)" 5 "First degree" 6 "Higher degree"
	label values isced isced

	*Fill gaps from the same person in other waves: for each wave and each
	*distance j, first the wave j earlier (L), then the wave j later (F)
	xtset pid wave
	local edu_gap "(unified_education==.|unified_education==0)"
	local isc_gap "(isced==.|isced<0)"
	forval i=2/18 {
		local max = `i'-1
		forval j=1/`max' {
			foreach d in L F {
				replace unified_education=`d'`j'.unified_education if wave==`i'& ///
`edu_gap'&`d'`j'.unified_education!=.
				replace isced=`d'`j'.isced if wave==`i'&`isc_gap'& ///
`d'`j'.isced!=.&`d'`j'.isced>0
			}
		}
	}
	*Wave 1 has no earlier wave; it is filled from the next wave only
		replace unified_education=F.unified_education if wave==1& ///
`edu_gap'&F.unified_education!=.
		replace isced=F.isced if wave==1&`isc_gap'& ///
F.isced!=.&F.isced>0
	xtset, clear

*Marital status
*mlstat is copied for codes 0-5; every other value becomes missing
	gen marital=mlstat if mlstat>=0&mlstat<=5

*Union variable
*Member when any of tuin1, tuin2, orgmb equals 1; non-member when tuin1 or
*tuin2 equals 2; respondents without a job are always set to 0
	gen member_union=1 if inlist(1,tuin1,tuin2,orgmb)
	replace member_union=0 if inlist(2,tuin1,tuin2)
	replace member_union=0 if employed==0
	sort pid wave
	xtset pid wave
	*Employed respondents with njbnew==1 and no answer take the value from an
	*earlier wave; later iterations of j only matter while it is still missing
	forval i=2/18 {
		forval j=1/`=`i'-1' {
			replace member_union=L`j'.member_union if wave==`i'&member_union==.& ///
employed==1&njbnew==1
			replace member_union=0 if employed==0&wave==`i'
		}
	}
	rename member_union union
	*Any employed respondent still unanswered is treated as a non-member
	replace union=0 if union==.&employed==1
	xtset, clear

	rename xr_bhps xw

*Attach the time-invariant characteristics, keeping matched respondents only
merge m:1 pid using "temp/xwave_bhps.dta", keep(match) nogenerate

cap erase "temp/xwave_bhps.dta"

save "temp/bhps_all_waves.dta", replace

////////////////////////////////////////////////////////////////////////////////
*UKHLS data (2008-present)
////////////////////////////////////////////////////////////////////////////////

cd "$ukdata"

*Time-invariant respondent characteristics from the UKHLS cross-wave file
use "raw/xwavedat_ukhls.dta", clear
	*Male indicator: 1 for sex==1, 0 for sex==2, missing for any other code
	gen male=(sex==1) if inlist(sex,1,2)

	*Immigrant indicator: 1 when yr2uk4 is positive and non-missing, else 0
	gen immigrant=(yr2uk4>0&yr2uk4<.)

	*Non-white indicator from racel_dv, racel_bh and race_bh; the 0 assignment
	*runs last and therefore wins when the variables disagree
	gen nonwhite=.
	replace nonwhite=1 if (racel_dv>=5&racel_dv!=.)| ///
		(racel_bh>5&racel_bh!=.)|(race_bh>1&race_bh!=.)
	replace nonwhite=0 if race_bh==1|inrange(racel_dv,1,4)|inrange(racel_bh,1,5)

	keep pid pidp male immig nonwhite doby
save "temp/xwave_ukhls.dta", replace

*Prepare the UKHLS waves to be combined
*Both groups are processed the same way except for the cross-sectional weight:
*waves a-b use indinus_xw and waves c-h use indinub_xw. The two wave lists are
*kept as locals because the append step further down iterates over them.
local waves1 a b
local waves2 c d e f g h
foreach grp in 1 2 {
	local wt=cond(`grp'==1,"indinus_xw","indinub_xw")
	foreach x of local waves`grp' {
		use "raw/`x'_indresp.dta", clear
			rename `x'_* *
			rename jbterm1 jbterm
			gen wave="`x'"
			*union_active copies tujbpl; tujbpl and tuin1 are created as
			*all-missing when the wave file does not already contain them
			gen union_active=.
			cap gen tujbpl=.
			replace union_active=tujbpl
			cap gen tuin1=.
			keep jbsec* jbsemp jbsat jbterm* jbbgy finnow finfut employ jbhrs ///
				jsboss jssize age* pid pidp wave istrtdaty j2has jboff jbhas ///
				j2semp jbiindb_dv marstat j2hrs jsprf `wt' jbisco88_cc sex ///
				jbrgsc qfhigh_dv school union_active ukborn tuin1 racel_dv
			*Common name for the cross-sectional weight
			rename `wt' xw
		save "temp/ukhls_wave_`x'.dta", replace
	}
}
	
*Combine waves
*Stack the prepared wave files in order a-h and delete each once appended
clear
foreach x in `waves1' `waves2' {
	append using "temp/ukhls_wave_`x'.dta"
	cap erase "temp/ukhls_wave_`x'.dta"
}

*Rename some of the source variables
	*age_dv is parked under a capitalised name so that the wildcard drop
	*removes only the other age variables
	rename age_dv Age
	drop age*
	rename Age age
	drop jbsect jbsectpub
	rename (jbsec istrtdaty jbrgsc) (ukhls_security year socialclass)

*Generate dummies for categories of workers
*Each dummy is 1 when the stated condition holds and 0 otherwise; parttime is
*the exception and is left missing for respondents who are not employed
	gen temp=(jbterm==2)
	label var temp "Non-permanent employee"
	gen perm=(jbterm==1)
	label var perm "Permanent employee"
	gen employed=(jbhas==1|jboff==1)
	label var employed "Has a job"
	gen parttime=(jbhrs<35&jbhrs>0) if employed==1
	label var parttime "Part-time employee"
	gen semp=(jbsemp==2)
	label var semp "Self-employed"
	gen semp_boss=(jsboss==1)
	label var semp_boss "Self-employed and has employees"
	gen secondjob=(j2has==1)
	label var secondjob "Has a second job"
	gen secondjob_semp=(j2semp==2)
	label var secondjob_semp "Second job is self-employed"
	
*Generate numerical wave variable
*UKHLS letters a-h continue the BHPS numbering and map to 19-26; any other
*value keeps the default of 0
	gen wave1=0
	local k=18
	foreach w in a b c d e f g h {
		local ++k
		replace wave1=`k' if wave=="`w'"
	}
	drop wave
	rename wave1 wave
	label var wave "Wave"
	
*Generate job tenure variable
*Rows with a negative interview year are removed before tenure is computed
	drop if year<0
	gen jobtenure=year-jbbgy if jbbgy>0
	sort pidp wave
	xtset pidp wave
	*For each wave, fill a missing tenure of an employed respondent from an
	*earlier wave with a non-negative tenure, adding the years elapsed since.
	*Respondents without a job get tenure 0 in that wave.
	forval i=20/26 {
		forval j=1/`=`i'-19' {
			replace jobtenure=L`j'.jobtenure+(year-L`j'.year) if ///
				(wave==`i'&jobtenure==.&employed==1& ///
				L`j'.jobtenure!=.&L`j'.jobtenure>=0)
			replace jobtenure=0 if employed==0&wave==`i'
		}
	}
	*The first UKHLS wave is zeroed only after the loop
	replace jobtenure=0 if employed==0&wave==19
	xtset, clear

*Single-digit industry identifiers
	rename jbisco88_cc industry
	label var industry "International SOC: present job"
	label values industry .
	tostring industry, replace
	gen industry_2d=substr(industry,1,2)
	*Flag codes containing anything other than 0-9 . and -
	generate byte non_numeric = indexnot(industry_2d, "0123456789.-")
	gen industry_identifier=industry_2d if non_numeric==0
	gen isco=substr(industry_2d,1,1)
	*NOTE(review): rows whose first character is not a digit are deleted here,
	*whereas the BHPS section only blanks isco for such rows - confirm intended
	drop if isco==""|indexnot(isco,"0123456789")>0
	destring isco, replace

	rename qfhigh_dv education_us

*Generate age squared variable
	gen age_squared=age^2/100

*Marital status
*marstat codes are mapped as 2 to 1, 4 and 7 to 2, 5 to 3, 6 to 4, 1 to 5;
*every other value is left missing
	gen marital=cond(marstat==2,1, ///
		cond(inlist(marstat,4,7),2, ///
		cond(marstat==5,3, ///
		cond(marstat==6,4, ///
		cond(marstat==1,5,.)))))
		
*Unified education variable
*Three levels built from education_us: codes 1-6, codes 7-16, and code 96
*when school is not 3
	gen unified_education=.
	replace unified_education=1 if inlist(education_us,1,2,3,4,5,6)
	replace unified_education=2 if inlist(education_us,7,8,9,10,11,12,13,14,15,16)
	replace unified_education=3 if education_us==96&school!=3
	cap label define education_levels 1 "University/professional degree" ///
		2 "School qualification (e.g. A/O Level)" ///
		3 "Less than school qualification"
	label values unified_education education_levels

	*Fill gaps from the same person in other waves: for each wave and each
	*distance j, first the wave j earlier, then the wave j later
	xtset pidp wave
	forval i=20/26 {
		local max = `i'-19
		forval j=1/`max' {
			replace unified_education=L`j'.unified_education if wave==`i'& ///
				(unified_education==.|unified_education==0)&L`j'.unified_education!=.
			replace unified_education=F`j'.unified_education if wave==`i'& ///
				(unified_education==.|unified_education==0)&F`j'.unified_education!=.
		}
	}
	*The loop starts at wave 20, so the first UKHLS wave is back-filled here.
	*UKHLS waves are numbered 19-26 in this file; the condition has to test
	*wave 19, since a test on wave 1 can never match within this section.
		replace unified_education=F.unified_education if wave==19& ///
			(unified_education==.|unified_education==0)&F.unified_education!=.
	xtset, clear

*Union variable
*Member when tuin1 is 1, non-member when tuin1 is 2
	gen member_union=1 if tuin1==1
	replace member_union=0 if tuin1==2
	*Respondents without a job are non-members in every wave, including the
	*first UKHLS wave; this mirrors the BHPS section of this file
	replace member_union=0 if employed==0
	sort pidp wave
	xtset pidp wave
	forval i=20/26 {
		local max = `i'-19
		forval j=1/`max' {
			*Employed respondents with no answer take the value from an earlier wave
			replace member_union=L`j'.member_union if wave==`i'&member_union==.&employed==1
			*The variable is still called member_union at this point. Writing
			*union here would resolve, through variable-name abbreviation, to
			*union_active and overwrite that variable instead.
			replace member_union=0 if employed==0&wave==`i'
		}
	}
	rename member_union union
	*Any employed respondent still unanswered is treated as a non-member
	replace union=0 if union==.&employed==1
	xtset, clear
	
*Merge with unchanging characteristics
*Only respondent-wave rows that find a match in the cross-wave file are kept
merge m:1 pidp using "temp/xwave_ukhls.dta", keep(match) nogenerate

cap erase "temp/xwave_ukhls.dta"

save "temp/ukhls_all_waves.dta", replace

////////////////////////////////////////////////////////////////////////////////
*Combine BHPS with UKHLS
////////////////////////////////////////////////////////////////////////////////
cd "$ukdata"

*Stack the BHPS rows (waves 1-18) and the UKHLS rows (waves 19-26)
use "temp/bhps_all_waves.dta", clear
append using "temp/ukhls_all_waves.dta"

*Create unified id variable
*A negative or missing pid is set to 0 whenever pidp is positive, and the
*person identifier is then the group formed by the pid and pidp pair
	replace pid=0 if pidp>0&(pid<0|pid==.)
	egen id=group(pid pidp)

*Update union, job tenure, and education variables to reflect combination of
*BHPS and UKHLS
	xtset id wave
	
*Union
*Repeats the carry-forward over the full 26-wave panel so that values can now
*cross from the BHPS waves into the UKHLS waves.
*NOTE(review): njbnew is absent from the UKHLS keep lists, so it is missing on
*UKHLS rows and the first replace cannot fire for them; both sections also set
*union to 0 for employed rows with a missing value before saving - confirm
*whether this first replace still has any effect.
	forval i=2/26 {
		local max = `i'-1
		forval j=1/`max' {
			replace union=L`j'.union if wave==`i'&union==.& ///
employed==1&njbnew==1
			replace union=0 if employed==0&wave==`i'
		}
	}
	
*Job tenure	
*Fill a missing tenure of an employed respondent from an earlier wave with a
*non-negative tenure, adding the years elapsed; no job means tenure 0.
*Wave 1 is zeroed only after the loop has finished.
	forval i=2/26 {
		local max = `i'-1
		forval j=1/`max' {
			replace jobtenure=L`j'.jobtenure+(year-L`j'.year) if (wave==`i'&jobtenure==.&employed==1&L`j'.jobtenure!=.&L`j'.jobtenure>=0)
			replace jobtenure=0 if employed==0&wave==`i'
		}
	}
	replace jobtenure=0 if employed==0&wave==1

*Education	
*Fill a missing or zero education level from the same person, trying the wave
*j earlier and then the wave j later for each distance j; wave 1 is filled
*from the following wave only.
	forval i=2/26 {
		local max = `i'-1
		forval j=1/`max' {
			replace unified_education=L`j'.unified_education if wave==`i'&(unified_education==.|unified_education==0)&L`j'.unified_education!=.
			replace unified_education=F`j'.unified_education if wave==`i'&(unified_education==.|unified_education==0)&F`j'.unified_education!=.
		}
	}
		replace unified_education=F.unified_education if wave==1&(unified_education==.|unified_education==0)&F.unified_education!=.
*University dummy: 1 for level 1, 0 for levels 2-3, missing otherwise
	gen uni=1 if unified_education==1
	replace uni=0 if inrange(unified_education,2,3)

*General job satisfaction indicator
*1 for jbsat 4-7, 0 for jbsat 1-3, missing for any other value
	gen satisfied=inrange(jbsat,4,7) if inlist(jbsat,1,2,3,4,5,6,7)

*Drop Armed Forces
	drop if isco==0

*Drop negative ages
	drop if age<0

*Relative weights
*Each weight is divided by the mean weight of its interview year
	bys year: egen mean_xw=mean(xw)
	gen relative_xw=xw/mean_xw

cap erase "temp/bhps_all_waves.dta"
cap erase "temp/ukhls_all_waves.dta"

save "temp/uk_linear.dta", replace

////////////////////////////////////////////////////////////////////////////////
*Imputation procedure--translate UKHLS job security question into BHPS question
////////////////////////////////////////////////////////////////////////////////

cd "$ukdata"

*Translation sample: waves 6 and 7 only, the waves named in the earlier
*comment on renaming the UKHLS security question.
use "temp/uk_linear.dta", clear
keep if inrange(wave,6,7)
*insample flags employed, non-self-employed respondents with positive answers
*to both security questions and no missing covariate
gen insample=(employed==1&bhps_security>0&ukhls_security>0&!missing(temp,parttime, semp,semp_boss,age,age_squared,jobtenure,union,male,marital,uni,immigrant, nonwhite,isco)&semp==0&semp_boss==0)

save "clean/uk_translation.dta", replace

*Now carry out the imputation
*Rows with a security answer below 1 on either question are removed in memory
*only; the saved translation file keeps them
use "clean/uk_translation.dta", clear
	drop if ukhls_security<1
	drop if bhps_security<1

*Opposite direction imputation
*Multinomial logit of the UKHLS answer on the BHPS answer and covariates,
*weighted by relative_xw with robust standard errors. The estimates stay in
*memory and are used by the predict call in the next section.
*NOTE(review): b(2) is read here as the abbreviated base-outcome option - confirm
mlogit ukhls_security i.bhps_security age age_squared jobtenure i.male i.temp i.parttime i.union ib1.marital i.uni i.immigrant i.nonwhite ib1.isco if insample [pw=relative_xw], vce(robust) b(2)

*Predict probabilities of UKHLS response for BHPS sample
*The mlogit estimates from the previous step are applied to the full panel;
*ukhls1-ukhls4 are created for BHPS waves (1-18) only
use "temp/uk_linear.dta", clear
	predict ukhls* if inrange(wave,1,18)
	gen bhps=inrange(wave,1,18)
	*Probability-weighted answer. NOTE(review): this variable is not in the
	*keep list below, so it does not reach the saved file - confirm intended
	gen pred_ukhls_security=ukhls1+2*ukhls2+3*ukhls3+4*ukhls4 if bhps==1

*Covariates that must all be non-missing for a row to count
local covars temp,parttime,semp,semp_boss,age,age_squared,jobtenure,union, ///
	male,marital,uni,immigrant,nonwhite,isco

*stats flags employed rows with complete covariates in waves 20, 22, 24, 26
gen stats=(employed==1&!missing(`covars')&inlist(wave,20,22,24,26))

*Keep rows with a prediction, a positive UKHLS answer, or the stats flag
keep if !missing(ukhls1)|(ukhls_security>0&!missing(ukhls_security))|stats==1

*insample: complete covariates and year, age up to 65, employed, a positive
*answer to the security question of the row's own survey, non-zero weight
gen insample=!missing(`covars',year)&age<=65&employed==1& ///
	((bhps&bhps_security>0)|(!bhps&ukhls_security>0))&relative_xw!=0

keep id pid year wave temp parttime age age_squared jobtenure semp semp_boss ///
	union male marital uni immigrant nonwhite isco bhps bhps_security ukhls* ///
	relative_xw insample stats cjsbgy4 cjsbgd cjsbgm doim doid jbsat satisfied

*Save longitudinal-format dataset for summary statistics
save "clean/uk_linear_reverse.dta", replace

// cap erase "temp/uk_linear.dta"

*Transform data to Poisson format, reverse imputation direction
*Every row is expanded into four copies, one per possible UKHLS answer t
use "clean/uk_linear_reverse.dta", clear
	gen ob=_n
	expand 4
	bys ob: gen t=_n
	*v1-v4 mark which answer a copy stands for
	forval i=1/4 {
		gen v`i'=(t==`i')
	}

	*choice is the predicted probability of answer t where one exists, 1 when
	*the observed UKHLS answer equals t, and 0 on the other copies of UKHLS rows
	gen choice=.
	forval i=1/4 {
		replace choice=ukhls`i' if t==`i'&!missing(ukhls`i')
		replace choice=1 if t==`i'&ukhls_security==`i'
	}
	replace choice=0 if bhps==0&t!=ukhls_security

*Person-year id variable
	egen person_year_id=group(id wave)

save "clean/uk_poisson_reverse.dta", replace

////////////////////////////////////////////////////////////////////////////////

*UK unemployment rates

////////////////////////////////////////////////////////////////////////////////
cd "$ukdata"

*Read the unemployment series, trimming the first 8 rows and then everything
*from row 50 onward
import delimited "raw/uk_unrate.csv", clear
	drop in 1/8
	drop in 50/l
	rename (v1 v2) (year urate)
	destring year urate, replace
	label var year "Year"
	label var urate "Unemployment rate"
	*Rescale from percent to a proportion
	replace urate=urate/100
save "clean/unrate.dta", replace

*Delete every file left in the temp folder.
*The script stops at the cd if the folder cannot be entered, so nothing is
*erased from any other directory. The working directory stays at temp.
cd "$ukdata/temp"
local files: dir "`c(pwd)'" files "*"

foreach file of local files {
	*The name is quoted so that file names containing spaces are erased too;
	*unquoted, erase would fail on them and cap would hide the failure
	cap erase "`file'"
}
